{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Using BookNLP folder path from config.py -- /Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/booknlp-output-poetry/\n",
      "PLACE LEXICON has 1560 entries.\n",
      "\n",
      "----\n",
      "Using the two BookNLP paths: /Users/sunyambagga/Desktop/txtLAB-2/minimal-narrativity/booknlp-output-narrativity/ \n",
      " /Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/booknlp-output-poetry/ \n",
      "----\n",
      "\n",
      "Loading TMV features pickle... Size: 167646\n",
      "Loading TMV features pickle... Size: 167527\n",
      "Top POS-TMV features: ['agenthood', 'vbd', 'nn', 'vbz', 'concreteness', '-rrb-', '-lrb-', 'jj', 'in', 'prp', 'dt', 'eventfulness', 'nns', 'setting', 'temporality', 'vbn', 'agency', 'vbp', 'cc', 'cd', 'feltness', 'wdt', 'coherence', 'nnp', 'md', 'rp', 'pct_quoted', 'temporal_order', 'to', 'vb', 'saying', 'rb', \"'\", 'pos', 'vbg', 'wp', '``', 'wrb', 'nnps', 'jjs', 'ex', 'jjr', 'rbr', 'rbs', 'sym', 'ls', 'pdt', 'fw', 'uh']\n",
      "\n",
      "----\n",
      "Using the two BookNLP paths: /Users/sunyambagga/Desktop/txtLAB-2/minimal-narrativity/booknlp-output-narrativity/ \n",
      " /Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/booknlp-output-poetry/ \n",
      "----\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import ast\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "import data_loader\n",
    "import vectorizer\n",
    "import best_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(401, 12)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "      <th>Within401_AllCategories_Prob_Narrative</th>\n",
       "      <th>Within401_PosMood_Prob_Narrative</th>\n",
       "      <th>Within401_PosUnigrams_Prob_Narrative</th>\n",
       "      <th>Within401_PosTMVQuoted_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "      <td>0.100</td>\n",
       "      <td>0.171560</td>\n",
       "      <td>0.224250</td>\n",
       "      <td>0.150701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "      <td>0.182</td>\n",
       "      <td>0.248461</td>\n",
       "      <td>0.231297</td>\n",
       "      <td>0.234420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \\\n",
       "0                                0.012   3                           0.232556   \n",
       "1                                0.004   4                           0.236963   \n",
       "\n",
       "   Within401_AllCategories_Prob_Narrative  Within401_PosMood_Prob_Narrative  \\\n",
       "0                                   0.100                          0.171560   \n",
       "1                                   0.182                          0.248461   \n",
       "\n",
       "   Within401_PosUnigrams_Prob_Narrative  Within401_PosTMVQuoted_Prob_Narrative  \n",
       "0                              0.224250                               0.150701  \n",
       "1                              0.231297                               0.234420  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load Predictions_Top5Models.tsv (tab-separated) and show its shape and first two rows.\n",
    "# NOTE(review): the next cell rebinds annotated_df, so this frame appears to be used for display only.\n",
    "annotated_df = pd.read_csv(\n",
    "    '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/results/Predictions_Top5Models.tsv',\n",
    "    sep='\\t',\n",
    ")\n",
    "print(annotated_df.shape)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>genre</th>\n",
       "      <th>filename</th>\n",
       "      <th>avg_overall</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>- .~\\r\\nhi -\\r\\nfâ€˜. w\\r\\nHUMMINGBIRDS. 215\\r...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086857689-norm.txt</td>\n",
       "      <td>2.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>,22 For ndvertisinir J'Tid other expermos inci...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_dul1-ark-13960-t2q53cp26-norm.txt</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text      genre  \\\n",
       "0  - .~\\r\\nhi -\\r\\nfâ€˜. w\\r\\nHUMMINGBIRDS. 215\\r...  19CNONFIC   \n",
       "1  ,22 For ndvertisinir J'Tid other expermos inci...  19CNONFIC   \n",
       "\n",
       "                                         filename  avg_overall  \n",
       "0             19CNONFIC_5S_chi-086857689-norm.txt     2.666667  \n",
       "1  19CNONFIC_5S_dul1-ark-13960-t2q53cp26-norm.txt     1.000000  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load annotated_dataset_401.csv, keeping only the four columns listed below.\n",
    "# run_experiments later adds one probability column per experiment to this frame.\n",
    "annotated_df = pd.read_csv(\n",
    "    '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/annotated_dataset_401.csv'\n",
    ")[['text', 'genre', 'filename', 'avg_overall']]\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File names and class labels of the annotated passages; run_experiments reads both as globals.\n",
    "# NOTE(review): threshold=2.5 is presumably the reader-score cut-off between the two classes -- confirm in data_loader.\n",
    "fnames, Y = data_loader.load_annotated_data(\n",
    "    threshold=2.5,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_params(algo_name, name):\n",
    "#     fname = 'WithQuotationMarksCountVec_ReaderAnnotated_WithPctQuoted_WithoutTemporalOrder_'+algo_name+'__5_foldcv.txt'\n",
    "    fname = algo_name+'__5_foldcv.txt'\n",
    "    df = pd.read_csv('../../new-results/'+fname, delimiter='\\t')\n",
    "    return ast.literal_eval(df.loc[df['Feature']==name]['Parameters'].values[0])\n",
    "\n",
    "# Feature-set name -> name of the function in the project-local vectorizer module that builds it.\n",
    "# Attribute names (not the functions themselves) are stored so that a function missing from\n",
    "# vectorizer only raises when that particular feature set is requested.\n",
    "_VECTORIZER_ATTR_BY_FEATURE = {\n",
    "    'pos1': 'pos_unigrams',\n",
    "    'pos2': 'pos_bigrams',\n",
    "    'pos3': 'pos_trigrams',\n",
    "    'pos23': 'pos_bitri_grams',\n",
    "\n",
    "    'word1': 'word_unigrams',\n",
    "    'word2': 'word_bigrams',\n",
    "    'word3': 'word_trigrams',\n",
    "    'word23': 'word_bitri_grams',\n",
    "\n",
    "    'dep1': 'dep_unigrams',\n",
    "    'dep2': 'dep_bigrams',\n",
    "    'dep3': 'dep_trigrams',\n",
    "    'dep23': 'dep_bitri_grams',\n",
    "\n",
    "    'tense': 'tense',\n",
    "    'mood': 'mood',\n",
    "    'voice': 'voice',\n",
    "    'tense_mood_voice': 'tense_mood_voice',\n",
    "\n",
    "    'pos_tense': 'pos_tense',\n",
    "    'pos_mood': 'pos_mood',\n",
    "    'pos_voice': 'pos_voice',\n",
    "    'pos_tense_mood_voice': 'pos_tmv',\n",
    "    'pos_tense_mood_voice_quoted': 'pos_tmv_quoted',\n",
    "\n",
    "    # pos1 (max=100) + dep1 (max=100) + tense + mood + voice\n",
    "    'pos_dep_tense_mood_voice': 'pos_dep_tmv',\n",
    "    # pos1 (max=100) + word1 (max=100) + dep1 (max=100) + tense + mood + voice + pct_quoted\n",
    "    'all_categories': 'all_feature_categories_uni',\n",
    "    # pos1 (max=100) + word1 (max=5000) + dep23 (max=5000) + tense + mood + voice\n",
    "    'all_categories_best': 'all_feature_categories',\n",
    "}\n",
    "\n",
    "# Classifier short name -> zero-argument factory for a not-yet-tuned estimator.\n",
    "# Every estimator gets a fixed random_state: run_experiments fits each model twice (once for\n",
    "# hard labels, once for probabilities), and without a seed the liblinear-based\n",
    "# LogisticRegression and SVC(probability=True) may produce different fits on the two passes.\n",
    "_CLASSIFIER_FACTORIES = {\n",
    "    'logreg': lambda: LogisticRegression(random_state=42),\n",
    "    'rf': lambda: RandomForestClassifier(n_estimators=500, random_state=42),\n",
    "    'svm': lambda: SVC(probability=True, random_state=42),\n",
    "}\n",
    "\n",
    "\n",
    "def run_experiments(algo_name, name):\n",
    "    \"\"\"\n",
    "    Run experiments for different feature categories as specified by \"name\".\n",
    "\n",
    "    Builds the feature matrix for the global `fnames` with the vectorizer selected by\n",
    "    `name`, configures the classifier selected by `algo_name` with the hyperparameters\n",
    "    returned by get_params, and obtains 5-fold cross-validated hard predictions and\n",
    "    class probabilities against the global labels `Y`.\n",
    "\n",
    "    Side effects\n",
    "    ------------\n",
    "    * Prints the hyperparameters and data sizes, plus one line per passage whose hard\n",
    "      prediction disagrees with its probability at the 0.5 cut-off.\n",
    "    * Adds the column 'Within401____<algo_name>____<name>' to the global `annotated_df`,\n",
    "      holding the second predict_proba column for each row's 'filename' (NaN for file\n",
    "      names that are not in `fnames`).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    algo_name : str\n",
    "        One of 'logreg', 'rf', 'svm'.\n",
    "    name : str\n",
    "        A feature-set name, i.e. a key of _VECTORIZER_ATTR_BY_FEATURE.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `name` or `algo_name` is not recognised (checked before any expensive work).\n",
    "    \"\"\"\n",
    "    if name not in _VECTORIZER_ATTR_BY_FEATURE:\n",
    "        raise ValueError('Unknown feature set {!r}; expected one of {}'.format(\n",
    "            name, sorted(_VECTORIZER_ATTR_BY_FEATURE)))\n",
    "    if algo_name not in _CLASSIFIER_FACTORIES:\n",
    "        raise ValueError('Unknown algorithm {!r}; expected one of {}'.format(\n",
    "            algo_name, sorted(_CLASSIFIER_FACTORIES)))\n",
    "\n",
    "    funct = getattr(vectorizer, _VECTORIZER_ATTR_BY_FEATURE[name])\n",
    "    # Second argument is an empty list and the second return value is discarded.\n",
    "    # NOTE(review): presumably the (absent) test split -- confirm in vectorizer.\n",
    "    X, _ = funct(fnames, [])\n",
    "\n",
    "    algo = _CLASSIFIER_FACTORIES[algo_name]()\n",
    "    param_dict = get_params(algo_name, name)\n",
    "    print(param_dict, len(fnames), len(Y), X.shape)\n",
    "    algo.set_params(**param_dict) # set the desired hyperparameters\n",
    "\n",
    "    # Two independent cross-validation passes over the same 5 folds.\n",
    "    predictions = cross_val_predict(algo, X, Y, cv=5, method='predict')\n",
    "    predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "\n",
    "    map_fname_prob = {}\n",
    "    for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "        # Column 1 is treated as the 'POS' probability; this assumes the labels in Y\n",
    "        # sort as ['NEG', 'POS'].\n",
    "        prob = probabilities[1]\n",
    "\n",
    "        # Report passages where the hard label and the probability disagree.\n",
    "        if prob > 0.5 and pred != 'POS':\n",
    "            print(prob, pred)\n",
    "        elif prob <= 0.5 and pred != 'NEG':\n",
    "            print(prob, pred)\n",
    "        map_fname_prob[fname] = prob\n",
    "\n",
    "    annotated_df['Within401____'+algo_name+'____'+name] = annotated_df['filename'].map(map_fname_prob)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "features = ['pos1', 'pos2', 'pos3', 'pos23', 'word1', 'word2', 'word3', 'word23', 'dep1', 'dep2', 'dep3', 'dep23',\n",
    "            'tense', 'mood', 'voice', 'tense_mood_voice', 'pos_tense', 'pos_mood', 'pos_voice', \n",
    "            'pos_tense_mood_voice', 'pos_tense_mood_voice_quoted', 'pos_dep_tense_mood_voice', 'all_categories']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pos1\n",
      "{'max_depth': 5} 401 401 (401, 38)\n",
      "--------\n",
      "pos2\n",
      "{'max_depth': None} 401 401 (401, 875)\n",
      "--------\n",
      "pos3\n",
      "{'max_depth': 20} 401 401 (401, 6192)\n",
      "--------\n",
      "pos23\n",
      "{'max_depth': 20} 401 401 (401, 7067)\n",
      "--------\n",
      "word1\n",
      "{'max_depth': 20} 401 401 (401, 8319)\n",
      "--------\n",
      "word2\n",
      "{'max_depth': 5} 401 401 (401, 30516)\n",
      "--------\n",
      "word3\n",
      "{'max_depth': None} 401 401 (401, 39139)\n",
      "--------\n",
      "word23\n",
      "{'max_depth': 5} 401 401 (401, 69655)\n",
      "--------\n",
      "dep1\n",
      "{'max_depth': 20} 401 401 (401, 45)\n",
      "--------\n",
      "dep2\n",
      "{'max_depth': 20} 401 401 (401, 1004)\n",
      "--------\n",
      "dep3\n",
      "{'max_depth': None} 401 401 (401, 6140)\n",
      "--------\n",
      "dep23\n",
      "{'max_depth': None} 401 401 (401, 7144)\n",
      "--------\n",
      "tense\n",
      "{'max_depth': 5} 401 401 (401, 1)\n",
      "--------\n",
      "mood\n",
      "{'max_depth': 5} 401 401 (401, 4)\n",
      "--------\n",
      "voice\n",
      "{'max_depth': 5} 401 401 (401, 4)\n",
      "--------\n",
      "tense_mood_voice\n",
      "{'max_depth': 5} 401 401 (401, 9)\n",
      "--------\n",
      "pos_tense\n",
      "Train -- Other: (401, 1) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 39) & test: (0,)\n",
      "{'max_depth': 5} 401 401 (401, 39)\n",
      "--------\n",
      "pos_mood\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'max_depth': 20} 401 401 (401, 42)\n",
      "--------\n",
      "pos_voice\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'max_depth': 5} 401 401 (401, 42)\n",
      "--------\n",
      "pos_tense_mood_voice\n",
      "Train -- Other: (401, 9) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 47) & test: (0,)\n",
      "{'max_depth': 5} 401 401 (401, 47)\n",
      "--------\n",
      "pos_tense_mood_voice_quoted\n",
      "Train -- Other: (401, 10) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 48) & test: (0,)\n",
      "{'max_depth': 20} 401 401 (401, 48)\n",
      "--------\n",
      "pos_dep_tense_mood_voice\n",
      "Train -- tmv: (401, 9) | pos: (401, 38) | dep: (401, 45)\n",
      "Test -- tmv: (0,) | pos: (0, 38) | dep: (0, 45)\n",
      "Combined shape - train: (401, 92) & test: (0,)\n",
      "{'max_depth': 5} 401 401 (401, 92)\n",
      "--------\n",
      "all_categories\n",
      "Train -- tmv-quoted: (401, 10) | dep: (401, 45) | pos: (401, 38) | word: (401, 100)\n",
      "Test -- tmv-quoted: (0,) | dep: (0, 45) | pos: (0, 38) | word: (0, 100)\n",
      "Combined shape - train: (401, 193) & test: (0,)\n",
      "{'max_depth': 5} 401 401 (401, 193)\n",
      "--------\n"
     ]
    }
   ],
   "source": [
    "# Random forest ('rf'): one cross-validated run per feature set, separated by a dashed line.\n",
    "for name in features:\n",
    "    print(name)\n",
    "    run_experiments(algo_name='rf', name=name)\n",
    "    print(\"--------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pos1\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 38)\n",
      "--------\n",
      "pos2\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 875)\n",
      "--------\n",
      "pos3\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 6192)\n",
      "--------\n",
      "pos23\n",
      "{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 7067)\n",
      "--------\n",
      "word1\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 8319)\n",
      "--------\n",
      "word2\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 30516)\n",
      "0.7953304180205082 NEG\n",
      "0.8462522769655585 NEG\n",
      "0.4705456884730573 POS\n",
      "0.4597439168267779 POS\n",
      "0.43597427808250305 POS\n",
      "0.6191600596598197 NEG\n",
      "0.6754309884824361 NEG\n",
      "0.5269852324268498 NEG\n",
      "0.3267479275818915 POS\n",
      "0.6544049024491639 NEG\n",
      "0.26305009146029423 POS\n",
      "0.7165603498862678 NEG\n",
      "0.3340963620239407 POS\n",
      "0.4155095599503635 POS\n",
      "0.9370915319307415 NEG\n",
      "0.8755029737462849 NEG\n",
      "0.7214886531202835 NEG\n",
      "0.7119994313894464 NEG\n",
      "0.42508819306217593 POS\n",
      "0.46901829042772114 POS\n",
      "0.8213728268691805 NEG\n",
      "0.6572115135942652 NEG\n",
      "0.28212931319632967 POS\n",
      "0.44033797229478067 POS\n",
      "0.336076894735427 POS\n",
      "0.45859362183139424 POS\n",
      "0.39699844908317206 POS\n",
      "0.26751315688072513 POS\n",
      "0.437178838240328 POS\n",
      "0.6464171405002914 NEG\n",
      "--------\n",
      "word3\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 39139)\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.12000044276173649 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.49982411839071916 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.1969490246346533 POS\n",
      "0.5 POS\n",
      "0.43003942069636397 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.49996062747783904 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5589962624946635 NEG\n",
      "0.9300876066566659 NEG\n",
      "0.49591565577168617 POS\n",
      "0.41633635884166215 POS\n",
      "0.3935743667313319 POS\n",
      "0.25235659046446135 POS\n",
      "0.2728218778824453 POS\n",
      "0.351620074358093 POS\n",
      "0.508179558931343 NEG\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.30954775419949626 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.4915727060936465 POS\n",
      "0.543618987819678 NEG\n",
      "--------\n",
      "word23\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 69655)\n",
      "0.34072472398245823 POS\n",
      "0.699917363786868 NEG\n",
      "0.6132780938201586 NEG\n",
      "0.6526569765859834 NEG\n",
      "0.3091704196874577 POS\n",
      "0.5139976758766275 NEG\n",
      "0.5195937160357266 NEG\n",
      "0.5362318877116357 NEG\n",
      "0.34070800783008975 POS\n",
      "0.5690576085266399 NEG\n",
      "0.5369406853855072 NEG\n",
      "0.1586189588757818 POS\n",
      "0.3941580520174437 POS\n",
      "0.32477119545409805 POS\n",
      "0.6650787774922046 NEG\n",
      "0.7561651043579585 NEG\n",
      "0.4469829950617889 POS\n",
      "0.5099451337181963 NEG\n",
      "0.6373657970767302 NEG\n",
      "0.5241437899506536 NEG\n",
      "0.837655433803054 NEG\n",
      "0.54163931143803 NEG\n",
      "0.7684346051944426 NEG\n",
      "0.6547028407828794 NEG\n",
      "0.5403859556519908 NEG\n",
      "0.48655432142547284 POS\n",
      "0.3643389732854784 POS\n",
      "0.7289935979520897 NEG\n",
      "0.5379234740503932 NEG\n",
      "0.42016579146761623 POS\n",
      "--------\n",
      "dep1\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 45)\n",
      "--------\n",
      "dep2\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 1004)\n",
      "--------\n",
      "dep3\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 6140)\n",
      "--------\n",
      "dep23\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 7144)\n",
      "0.502705134277736 NEG\n",
      "0.5014099904106879 NEG\n",
      "--------\n",
      "tense\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 1)\n",
      "--------\n",
      "mood\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 4)\n",
      "--------\n",
      "voice\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 4)\n",
      "--------\n",
      "tense_mood_voice\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 9)\n",
      "--------\n",
      "pos_tense\n",
      "Train -- Other: (401, 1) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 39) & test: (0,)\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 39)\n",
      "--------\n",
      "pos_mood\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 42)\n",
      "--------\n",
      "pos_voice\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 42)\n",
      "--------\n",
      "pos_tense_mood_voice\n",
      "Train -- Other: (401, 9) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 47) & test: (0,)\n",
      "{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 47)\n",
      "--------\n",
      "pos_tense_mood_voice_quoted\n",
      "Train -- Other: (401, 10) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 48) & test: (0,)\n",
      "{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 48)\n",
      "--------\n",
      "pos_dep_tense_mood_voice\n",
      "Train -- tmv: (401, 9) | pos: (401, 38) | dep: (401, 45)\n",
      "Test -- tmv: (0,) | pos: (0, 38) | dep: (0, 45)\n",
      "Combined shape - train: (401, 92) & test: (0,)\n",
      "{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 92)\n",
      "0.5000097328836028 NEG\n",
      "--------\n",
      "all_categories\n",
      "Train -- tmv-quoted: (401, 10) | dep: (401, 45) | pos: (401, 38) | word: (401, 100)\n",
      "Test -- tmv-quoted: (0,) | dep: (0, 45) | pos: (0, 38) | word: (0, 100)\n",
      "Combined shape - train: (401, 193) & test: (0,)\n",
      "{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'} 401 401 (401, 193)\n",
      "--------\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression ('logreg'): one cross-validated run per feature set, separated by a dashed line.\n",
    "for name in features:\n",
    "    print(name)\n",
    "    run_experiments(algo_name='logreg', name=name)\n",
    "    print(\"--------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pos1\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 38)\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5138592434482238 NEG\n",
      "--------\n",
      "pos2\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 875)\n",
      "0.5177115998536261 NEG\n",
      "0.5053223810966648 NEG\n",
      "0.5077268819055177 NEG\n",
      "0.5102608669332727 NEG\n",
      "0.5119473445399687 NEG\n",
      "0.5216778733153499 NEG\n",
      "--------\n",
      "pos3\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 6192)\n",
      "0.541028658088063 NEG\n",
      "0.5431838040308588 NEG\n",
      "0.5417579800841517 NEG\n",
      "0.5377435692320696 NEG\n",
      "0.509547245167432 NEG\n",
      "0.5076460545656823 NEG\n",
      "0.5298433392959421 NEG\n",
      "0.5258644766036694 NEG\n",
      "0.5072267222303649 NEG\n",
      "0.525015810124676 NEG\n",
      "0.5238438017861403 NEG\n",
      "0.5182508092815202 NEG\n",
      "0.5145026041201423 NEG\n",
      "0.505409013536002 NEG\n",
      "0.5170961498403003 NEG\n",
      "--------\n",
      "pos23\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 7067)\n",
      "--------\n",
      "word1\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 8319)\n",
      "0.5145714721585536 NEG\n",
      "0.5198682981546452 NEG\n",
      "0.5184376955289142 NEG\n",
      "0.507602989271498 NEG\n",
      "--------\n",
      "word2\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 30516)\n",
      "0.5261906335561122 NEG\n",
      "0.5074034354168141 NEG\n",
      "0.5076171123155095 NEG\n",
      "0.5223188222057926 NEG\n",
      "0.5333924275347177 NEG\n",
      "0.5531782746489018 NEG\n",
      "0.5247288141702794 NEG\n",
      "0.5285445244213147 NEG\n",
      "0.5411288105035345 NEG\n",
      "0.5100078656273067 NEG\n",
      "0.5361378640687885 NEG\n",
      "0.5126892147103537 NEG\n",
      "0.5316186696483484 NEG\n",
      "0.5657793771727989 NEG\n",
      "0.5217513004529285 NEG\n",
      "0.5368631381460869 NEG\n",
      "0.5690026618617968 NEG\n",
      "0.5642385535245376 NEG\n",
      "0.548666163395887 NEG\n",
      "0.5111058999296698 NEG\n",
      "0.5215452653899778 NEG\n",
      "0.541087483101605 NEG\n",
      "0.5396010750904786 NEG\n",
      "0.5422814057267055 NEG\n",
      "0.5636870464265726 NEG\n",
      "0.5243838799463758 NEG\n",
      "0.5135896612146619 NEG\n",
      "0.5311985934511264 NEG\n",
      "0.5290742214185759 NEG\n",
      "0.5200222362345615 NEG\n",
      "0.5414786150336964 NEG\n",
      "0.51745599658698 NEG\n",
      "0.5127053722603104 NEG\n",
      "0.5418833081325252 NEG\n",
      "0.5608661167980005 NEG\n",
      "0.5548317336900969 NEG\n",
      "0.514486456197629 NEG\n",
      "0.5141276637547895 NEG\n",
      "0.5523735179046092 NEG\n",
      "0.5257180194853579 NEG\n",
      "0.505042810536988 NEG\n",
      "0.5582703752161475 NEG\n",
      "0.5602411102456125 NEG\n",
      "0.533315222833496 NEG\n",
      "0.527797270304101 NEG\n",
      "0.5138866858258219 NEG\n",
      "--------\n",
      "word3\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 39139)\n",
      "0.5107914049997632 NEG\n",
      "0.508686730899345 NEG\n",
      "0.5063861807451826 NEG\n",
      "0.5124995319660938 NEG\n",
      "0.5051146706840663 NEG\n",
      "0.505854559927336 NEG\n",
      "0.5076872426182889 NEG\n",
      "0.5160639757170705 NEG\n",
      "0.5154392385706524 NEG\n",
      "0.5154032522011138 NEG\n",
      "0.5376266970348133 NEG\n",
      "0.5201655380549517 NEG\n",
      "0.5383261674936306 NEG\n",
      "0.5288017104028222 NEG\n",
      "0.5146155441772411 NEG\n",
      "0.5295669364460115 NEG\n",
      "0.5497830614965106 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5207423393174591 NEG\n",
      "0.5201655380549517 NEG\n",
      "0.5205157234028368 NEG\n",
      "0.5184495241697559 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5142895672378408 NEG\n",
      "0.5278912752456609 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5162113642244269 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5202667819266549 NEG\n",
      "0.5480310054258222 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5422422213156672 NEG\n",
      "0.5229480878380448 NEG\n",
      "0.513006764079668 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5434284033110339 NEG\n",
      "0.5097196326828961 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5493545195555286 NEG\n",
      "0.5261408174326213 NEG\n",
      "0.5407767266380894 NEG\n",
      "0.5284053685787776 NEG\n",
      "0.5184722664551281 NEG\n",
      "0.5268862767223191 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5105116385551398 NEG\n",
      "0.5282696045161857 NEG\n",
      "0.5179532679600665 NEG\n",
      "0.5085703966175679 NEG\n",
      "0.5175503041935386 NEG\n",
      "0.5062537672586811 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5071518023204103 NEG\n",
      "0.5439336592919841 NEG\n",
      "0.5065239321409866 NEG\n",
      "0.5054499845669859 NEG\n",
      "0.5058024437366918 NEG\n",
      "0.5050057624624296 NEG\n",
      "0.5070212078773881 NEG\n",
      "0.5074158528246994 NEG\n",
      "0.4926653533525719 POS\n",
      "0.5055445483345815 NEG\n",
      "--------\n",
      "word23\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 69655)\n",
      "0.5449336069605957 NEG\n",
      "0.5563766292131433 NEG\n",
      "0.555936190134677 NEG\n",
      "0.5208027299183666 NEG\n",
      "0.5314100704671135 NEG\n",
      "0.5213429405740532 NEG\n",
      "0.5677971064320955 NEG\n",
      "0.5217756470833042 NEG\n",
      "0.5210395662716694 NEG\n",
      "0.5699619867379248 NEG\n",
      "0.5275940486627125 NEG\n",
      "0.528119952909237 NEG\n",
      "0.5734976637620232 NEG\n",
      "0.555532225732959 NEG\n",
      "0.554192921364403 NEG\n",
      "0.5227447436779239 NEG\n",
      "0.5439065498366908 NEG\n",
      "0.5259874407789246 NEG\n",
      "0.5664638810337057 NEG\n",
      "0.5440417159200206 NEG\n",
      "0.507757333983756 NEG\n",
      "0.5244226412010494 NEG\n",
      "0.5305957704358598 NEG\n",
      "0.5114385267419731 NEG\n",
      "0.5204926225630785 NEG\n",
      "0.5234807231910422 NEG\n",
      "0.524933878316173 NEG\n",
      "0.5186116502064997 NEG\n",
      "0.5410660447042692 NEG\n",
      "0.6302651988142756 NEG\n",
      "0.6080417947158543 NEG\n",
      "0.6044116879226529 NEG\n",
      "0.6299336232597991 NEG\n",
      "0.5795103587118582 NEG\n",
      "0.5312202594624471 NEG\n",
      "0.6471266969053265 NEG\n",
      "0.6046550654620311 NEG\n",
      "0.5260559053021139 NEG\n",
      "0.5627253663930949 NEG\n",
      "0.5184190089963333 NEG\n",
      "0.6440797755278733 NEG\n",
      "0.5714534068006158 NEG\n",
      "0.5963270589994338 NEG\n",
      "0.5080644269674712 NEG\n",
      "0.5119919389428337 NEG\n",
      "0.5835260587196708 NEG\n",
      "0.564514501573663 NEG\n",
      "0.5388061958381527 NEG\n",
      "0.5208216227466586 NEG\n",
      "0.5483787188267262 NEG\n",
      "0.5535977294264811 NEG\n",
      "0.5401892125111742 NEG\n",
      "0.5646651255267426 NEG\n",
      "0.5784673976145602 NEG\n",
      "0.5293690338873223 NEG\n",
      "0.5392589730260513 NEG\n",
      "0.5519505684619825 NEG\n",
      "0.5454761126188261 NEG\n",
      "0.5265459069862598 NEG\n",
      "0.5157287730345144 NEG\n",
      "0.5135198334097134 NEG\n",
      "0.5346878850347129 NEG\n",
      "0.5209972146566423 NEG\n",
      "0.5354802147926139 NEG\n",
      "0.5187422172803098 NEG\n",
      "0.5532120242607754 NEG\n",
      "0.5205718342345362 NEG\n",
      "0.5316009492603753 NEG\n",
      "0.5128171322010643 NEG\n",
      "0.5217856161605351 NEG\n",
      "0.5450749012283844 NEG\n",
      "0.5352539459272891 NEG\n",
      "0.5254183609208988 NEG\n",
      "0.5346672361485195 NEG\n",
      "--------\n",
      "dep1\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 45)\n",
      "0.5 POS\n",
      "0.49346630703937777 POS\n",
      "0.5 POS\n",
      "0.49202998689291016 POS\n",
      "0.5062652040376701 NEG\n",
      "--------\n",
      "dep2\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 1004)\n",
      "0.5 POS\n",
      "--------\n",
      "dep3\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 6140)\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.4874760184792803 POS\n",
      "0.5 POS\n",
      "0.5078066049979317 NEG\n",
      "0.5 POS\n",
      "--------\n",
      "dep23\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 7144)\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.48149636934320805 POS\n",
      "--------\n",
      "tense\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 1)\n",
      "0.5194358720696333 NEG\n",
      "0.5194278712783401 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194340522181755 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194338897314192 NEG\n",
      "0.519433228951912 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194321406090826 NEG\n",
      "0.5194308261169085 NEG\n",
      "0.5194244863259163 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194329507289464 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194230631068825 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194319073927439 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194330616659527 NEG\n",
      "0.5194340371780792 NEG\n",
      "0.5194338351716454 NEG\n",
      "0.5194305005714688 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194283459017621 NEG\n",
      "0.5194242739051802 NEG\n",
      "0.5194332600474195 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194287673673383 NEG\n",
      "0.5194331310586474 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194290054115834 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194269911908392 NEG\n",
      "0.5194273871537852 NEG\n",
      "0.5194319769484946 NEG\n",
      "0.5194331971071153 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.5194344757056564 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.519433974446737 NEG\n",
      "0.5194343085353059 NEG\n",
      "0.5194330967960046 NEG\n",
      "0.5194280360005895 NEG\n",
      "0.5194311979240532 NEG\n",
      "0.5194320876053699 NEG\n",
      "0.5194337165173786 NEG\n",
      "0.519428268578085 NEG\n",
      "0.5194358720696333 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204240255328176 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204268259801743 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6203699163743656 NEG\n",
      "0.6204260981800296 NEG\n",
      "0.6204272945040978 NEG\n",
      "0.620428728849831 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6203875394602353 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204117969451625 NEG\n",
      "0.6204135377046363 NEG\n",
      "0.620416295345598 NEG\n",
      "0.6204216578795918 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204106582534541 NEG\n",
      "0.6204283712874444 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204079789806533 NEG\n",
      "0.6204271153022376 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6203898505863392 NEG\n",
      "0.6203947979522769 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6203835049714618 NEG\n",
      "0.6204254926216893 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204137637773737 NEG\n",
      "0.6204151488713004 NEG\n",
      "0.6203940955167502 NEG\n",
      "0.6204312356971184 NEG\n",
      "0.6204249477512238 NEG\n",
      "0.6204217374354751 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204254191074139 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204258408472298 NEG\n",
      "0.6204223371645142 NEG\n",
      "0.6204230679722276 NEG\n",
      "0.6204193984934372 NEG\n",
      "0.620426973879882 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204265088832267 NEG\n",
      "0.6204115230825579 NEG\n",
      "0.6204182495401614 NEG\n",
      "0.6204208899924845 NEG\n",
      "0.6203988158868391 NEG\n",
      "0.6204109540174904 NEG\n",
      "0.6203988158868391 NEG\n",
      "0.6204061034912015 NEG\n",
      "0.6204281872553524 NEG\n",
      "0.6204137637773737 NEG\n",
      "0.6204183815627193 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204146189222397 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.6204203198946152 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344438935187462 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.6204334321746625 NEG\n",
      "0.534445009845872 NEG\n",
      "0.62041689567862 NEG\n",
      "0.5344442150944809 NEG\n",
      "0.6204278126186311 NEG\n",
      "0.534444445154104 NEG\n",
      "0.6204052888190508 NEG\n",
      "0.534442899194425 NEG\n",
      "0.5344416216945347 NEG\n",
      "0.534444256923504 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344444906937645 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344437475936177 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344431164673994 NEG\n",
      "0.5344437875383131 NEG\n",
      "0.5344442199702565 NEG\n",
      "0.5344426505574297 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344427642108791 NEG\n",
      "0.5344446075029922 NEG\n",
      "0.5344438446899537 NEG\n",
      "0.5344403841058848 NEG\n",
      "0.5344444401568316 NEG\n",
      "0.5344443286303998 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344443724710017 NEG\n",
      "0.5344442699049247 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344402413360803 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344438250324133 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344418110943452 NEG\n",
      "0.5344439545202334 NEG\n",
      "0.5344437797529139 NEG\n",
      "0.5344435792932989 NEG\n",
      "0.5344446075029922 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534443763880745 NEG\n",
      "0.5344406861599014 NEG\n",
      "0.5344437557900616 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344425806054681 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344438902830151 NEG\n",
      "0.5344443529595251 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344439715416162 NEG\n",
      "0.534445009845872 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344421487404132 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344444548901693 NEG\n",
      "0.5344441897839255 NEG\n",
      "0.5344441515143656 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344440902049679 NEG\n",
      "0.5344441628082018 NEG\n",
      "0.5344384908695894 NEG\n",
      "0.5344427377916393 NEG\n",
      "0.5344394465841107 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344406536509736 NEG\n",
      "0.5344432216551066 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5344414334638533 NEG\n",
      "0.534445009845872 NEG\n",
      "0.5758906097663076 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758725497440227 NEG\n",
      "0.5758867196447927 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759033716028614 NEG\n",
      "0.5758982651292149 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758734132697444 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.575897006023509 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759022882176238 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759031093220307 NEG\n",
      "0.5759035207295489 NEG\n",
      "0.5758971003635829 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758877669852137 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758859579426621 NEG\n",
      "0.5758863455946399 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758990145099957 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.575900012810958 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758940986339165 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758910835631498 NEG\n",
      "0.5758981905155437 NEG\n",
      "0.5758968121975383 NEG\n",
      "0.5758798997534416 NEG\n",
      "0.575893338836091 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758843467641154 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758608441724687 NEG\n",
      "0.5759031778502929 NEG\n",
      "0.5758981147080539 NEG\n",
      "0.5759037645955432 NEG\n",
      "0.5758877669852137 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758745006724955 NEG\n",
      "0.5759037129153325 NEG\n",
      "0.5758223830000312 NEG\n",
      "0.5759022393245887 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758931058667217 NEG\n",
      "0.5758968121975383 NEG\n",
      "0.5758874297060962 NEG\n",
      "0.5758345663542586 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758946885390573 NEG\n",
      "0.5758994321899394 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5758811338290498 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5759016363104845 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.517889391004216 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5759036053361184 NEG\n",
      "0.5759076664513816 NEG\n",
      "0.5178895835831133 NEG\n",
      "0.5758961859905545 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5758993750075664 NEG\n",
      "0.5178882011416512 NEG\n",
      "0.5759018136675741 NEG\n",
      "0.5178865874921691 NEG\n",
      "0.5178876127480175 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178847637600272 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178898451501913 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178903998006149 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178715906438554 NEG\n",
      "0.5178907018695712 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178907369141407 NEG\n",
      "0.517885962320874 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178892976816674 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178876127480175 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178901941613583 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.517890546210439 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178854007574202 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178868726541365 NEG\n",
      "0.5178834416898228 NEG\n",
      "0.5178898451501913 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178878293095677 NEG\n",
      "0.5178821611702114 NEG\n",
      "0.5178889257948108 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.517885596756609 NEG\n",
      "0.5178895772435596 NEG\n",
      "0.517888024429376 NEG\n",
      "0.5178875312189619 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178899159435326 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178848911595094 NEG\n",
      "0.5178903210116284 NEG\n",
      "0.5178898000194362 NEG\n",
      "0.5178881671455757 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.517886601654022 NEG\n",
      "0.5178884805007022 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178903847354681 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178789246151622 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178917707288737 NEG\n",
      "0.5178761675743138 NEG\n",
      "0.5178917707288737 NEG\n",
      "--------\n",
      "mood\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 4)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.48000909975221956 POS\n",
      "0.5 POS\n",
      "0.47801435264546843 POS\n",
      "0.48581089504571584 POS\n",
      "0.480190709069345 POS\n",
      "0.48068569420169355 POS\n",
      "0.46789132502139835 POS\n",
      "0.4844555230404596 POS\n",
      "0.46884340917730283 POS\n",
      "0.47928423590922026 POS\n",
      "0.46828189253801095 POS\n",
      "0.47682755158552975 POS\n",
      "0.46967369334755094 POS\n",
      "0.5 POS\n",
      "0.4859776868539229 POS\n",
      "0.4786120191721493 POS\n",
      "0.4918847713057978 POS\n",
      "0.4838436375082063 POS\n",
      "0.5 POS\n",
      "0.4857998522962142 POS\n",
      "0.4909121178659687 POS\n",
      "0.4861135976863314 POS\n",
      "0.48013117974459585 POS\n",
      "0.48204936691892636 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.4913398119660235 POS\n",
      "0.47460084630429367 POS\n",
      "0.4813374705864715 POS\n",
      "0.5 POS\n",
      "0.4855757813660002 POS\n",
      "0.4746926143491405 POS\n",
      "0.47430599264486906 POS\n",
      "0.4768389236440576 POS\n",
      "0.48783410609426037 POS\n",
      "0.5 POS\n",
      "0.49141304451061757 POS\n",
      "0.47825529598096306 POS\n",
      "0.5 POS\n",
      "0.4778424049227417 POS\n",
      "0.4872829431958217 POS\n",
      "0.4907353409078178 POS\n",
      "0.48187861289894085 POS\n",
      "--------\n",
      "voice\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 4)\n",
      "0.39636750559428036 POS\n",
      "0.45257185961115143 POS\n",
      "0.4458804996723488 POS\n",
      "0.42134739246149344 POS\n",
      "0.4521336137627818 POS\n",
      "0.4618128152181708 POS\n",
      "0.4375883876241746 POS\n",
      "0.41161946771498736 POS\n",
      "0.42004681800115745 POS\n",
      "0.43555111335469365 POS\n",
      "0.46877374186464704 POS\n",
      "0.4111394921837442 POS\n",
      "0.49248034534124857 POS\n",
      "0.4467688691121656 POS\n",
      "0.45733815762281993 POS\n",
      "0.4835674390061076 POS\n",
      "0.48953792608419494 POS\n",
      "0.46715003138624783 POS\n",
      "0.4789860988027734 POS\n",
      "0.4493528739714393 POS\n",
      "0.4718767091281231 POS\n",
      "0.4834425032082328 POS\n",
      "0.4495740968272699 POS\n",
      "0.5 POS\n",
      "0.4921850945396838 POS\n",
      "0.4299583092960985 POS\n",
      "0.4630064025226982 POS\n",
      "0.48080413109323766 POS\n",
      "0.48870822043438505 POS\n",
      "0.49098693324083004 POS\n",
      "0.4871322635227914 POS\n",
      "0.4129693484406519 POS\n",
      "0.45407638920623516 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.4388757628484341 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "--------\n",
      "tense_mood_voice\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 9)\n",
      "0.5 POS\n",
      "0.494703329378081 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.4932656490395275 POS\n",
      "0.5 POS\n",
      "0.4932757483820197 POS\n",
      "0.4759609618034811 POS\n",
      "0.479088858472746 POS\n",
      "0.5 POS\n",
      "0.48624274742589857 POS\n",
      "0.4816466481610728 POS\n",
      "0.5 POS\n",
      "0.4900775569159938 POS\n",
      "0.4915539417600603 POS\n",
      "0.4880956964489165 POS\n",
      "0.4874248565269051 POS\n",
      "0.5 POS\n",
      "--------\n",
      "pos_tense\n",
      "Train -- Other: (401, 1) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 39) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 39)\n",
      "0.5064246510561461 NEG\n",
      "0.5139945113239737 NEG\n",
      "0.506934533322071 NEG\n",
      "0.5070904964530005 NEG\n",
      "--------\n",
      "pos_mood\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 42)\n",
      "0.5118627642814815 NEG\n",
      "0.5053613967706662 NEG\n",
      "0.5 POS\n",
      "--------\n",
      "pos_voice\n",
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 42)\n",
      "0.5123196714923471 NEG\n",
      "0.5120912545066432 NEG\n",
      "0.5 POS\n",
      "--------\n",
      "pos_tense_mood_voice\n",
      "Train -- Other: (401, 9) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 47) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 47)\n",
      "0.5101320477456974 NEG\n",
      "0.5072234908816923 NEG\n",
      "0.5 POS\n",
      "--------\n",
      "pos_tense_mood_voice_quoted\n",
      "Train -- Other: (401, 10) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 48) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 48)\n",
      "0.5 POS\n",
      "0.49370216207330925 POS\n",
      "--------\n",
      "pos_dep_tense_mood_voice\n",
      "Train -- tmv: (401, 9) | pos: (401, 38) | dep: (401, 45)\n",
      "Test -- tmv: (0,) | pos: (0, 38) | dep: (0, 45)\n",
      "Combined shape - train: (401, 92) & test: (0,)\n",
      "{'C': 1, 'kernel': 'linear'} 401 401 (401, 92)\n",
      "0.5 POS\n",
      "0.4910693220687494 POS\n",
      "0.48492947976283857 POS\n",
      "0.48568030072526713 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.5 POS\n",
      "0.4924529010222227 POS\n",
      "0.5 POS\n",
      "0.48862380523110827 POS\n",
      "0.4839644591379846 POS\n",
      "0.4766331184602793 POS\n",
      "0.47970299058939586 POS\n",
      "--------\n",
      "all_categories\n",
      "Train -- tmv-quoted: (401, 10) | dep: (401, 45) | pos: (401, 38) | word: (401, 100)\n",
      "Test -- tmv-quoted: (0,) | dep: (0, 45) | pos: (0, 38) | word: (0, 100)\n",
      "Combined shape - train: (401, 193) & test: (0,)\n",
      "{'C': 0.01, 'kernel': 'linear'} 401 401 (401, 193)\n",
      "0.5 POS\n",
      "0.4926525449827451 POS\n",
      "0.49456767402852786 POS\n",
      "0.49213179081285435 POS\n",
      "--------\n"
     ]
    }
   ],
   "source": [
    "# Run the SVM experiment once for every feature-set name in `features`.\n",
    "# `features` and `run_experiments` are not defined in this cell; they must\n",
    "# already exist in the kernel before it is executed.\n",
    "# Each iteration prints the feature-set name as a header, then a dashed\n",
    "# separator; everything printed in between is emitted during the\n",
    "# run_experiments call.\n",
    "# NOTE(review): the saved output of this cell is very long (hundreds of\n",
    "# per-item lines per feature set) -- consider clearing or capturing it\n",
    "# before sharing the notebook.\n",
    "for name in features:\n",
    "    print(name)\n",
    "    run_experiments('svm', name)\n",
    "    print(\"--------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(401, 73)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>genre</th>\n",
       "      <th>filename</th>\n",
       "      <th>avg_overall</th>\n",
       "      <th>Within401____rf____pos1</th>\n",
       "      <th>Within401____rf____pos2</th>\n",
       "      <th>Within401____rf____pos3</th>\n",
       "      <th>Within401____rf____pos23</th>\n",
       "      <th>Within401____rf____word1</th>\n",
       "      <th>Within401____rf____word2</th>\n",
       "      <th>...</th>\n",
       "      <th>Within401____svm____mood</th>\n",
       "      <th>Within401____svm____voice</th>\n",
       "      <th>Within401____svm____tense_mood_voice</th>\n",
       "      <th>Within401____svm____pos_tense</th>\n",
       "      <th>Within401____svm____pos_mood</th>\n",
       "      <th>Within401____svm____pos_voice</th>\n",
       "      <th>Within401____svm____pos_tense_mood_voice</th>\n",
       "      <th>Within401____svm____pos_tense_mood_voice_quoted</th>\n",
       "      <th>Within401____svm____pos_dep_tense_mood_voice</th>\n",
       "      <th>Within401____svm____all_categories</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>- .~\\r\\nhi -\\r\\nfâ€˜. w\\r\\nHUMMINGBIRDS. 215\\r...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086857689-norm.txt</td>\n",
       "      <td>2.666667</td>\n",
       "      <td>0.448108</td>\n",
       "      <td>0.424</td>\n",
       "      <td>0.381860</td>\n",
       "      <td>0.420023</td>\n",
       "      <td>0.327730</td>\n",
       "      <td>0.484688</td>\n",
       "      <td>...</td>\n",
       "      <td>0.372718</td>\n",
       "      <td>3.896500e-01</td>\n",
       "      <td>3.057980e-01</td>\n",
       "      <td>0.287683</td>\n",
       "      <td>0.291664</td>\n",
       "      <td>0.289072</td>\n",
       "      <td>0.293827</td>\n",
       "      <td>0.290459</td>\n",
       "      <td>0.258537</td>\n",
       "      <td>0.102668</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>,22 For ndvertisinir J'Tid other expermos inci...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_dul1-ark-13960-t2q53cp26-norm.txt</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.328137</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.157586</td>\n",
       "      <td>0.163924</td>\n",
       "      <td>0.280059</td>\n",
       "      <td>0.486874</td>\n",
       "      <td>...</td>\n",
       "      <td>0.163641</td>\n",
       "      <td>1.000000e-07</td>\n",
       "      <td>1.000000e-07</td>\n",
       "      <td>0.042644</td>\n",
       "      <td>0.056466</td>\n",
       "      <td>0.015624</td>\n",
       "      <td>0.023885</td>\n",
       "      <td>0.021508</td>\n",
       "      <td>0.011072</td>\n",
       "      <td>0.000103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>;\\r\\nFor the particulars of the warfare carrie...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_hvd-32044009984873-norm.txt</td>\n",
       "      <td>2.111111</td>\n",
       "      <td>0.211849</td>\n",
       "      <td>0.376</td>\n",
       "      <td>0.370150</td>\n",
       "      <td>0.363000</td>\n",
       "      <td>0.344209</td>\n",
       "      <td>0.485308</td>\n",
       "      <td>...</td>\n",
       "      <td>0.126247</td>\n",
       "      <td>5.587968e-01</td>\n",
       "      <td>2.034130e-01</td>\n",
       "      <td>0.116507</td>\n",
       "      <td>0.127819</td>\n",
       "      <td>0.135610</td>\n",
       "      <td>0.141850</td>\n",
       "      <td>0.136054</td>\n",
       "      <td>0.388536</td>\n",
       "      <td>0.343973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>.\\\\r\\nThe lower hackle feathers and those of t...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_coo-31924003689696-norm.txt</td>\n",
       "      <td>2.555556</td>\n",
       "      <td>0.317469</td>\n",
       "      <td>0.388</td>\n",
       "      <td>0.281614</td>\n",
       "      <td>0.319320</td>\n",
       "      <td>0.377767</td>\n",
       "      <td>0.500396</td>\n",
       "      <td>...</td>\n",
       "      <td>0.433406</td>\n",
       "      <td>3.963675e-01</td>\n",
       "      <td>3.618755e-01</td>\n",
       "      <td>0.139004</td>\n",
       "      <td>0.146435</td>\n",
       "      <td>0.164269</td>\n",
       "      <td>0.167478</td>\n",
       "      <td>0.161899</td>\n",
       "      <td>0.191254</td>\n",
       "      <td>0.161185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>\" There's not much trying needed on\\r\\none sid...</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_dul1-ark-13960-t7gq7nw7v-norm.txt</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>0.607992</td>\n",
       "      <td>0.578</td>\n",
       "      <td>0.557512</td>\n",
       "      <td>0.606257</td>\n",
       "      <td>0.548024</td>\n",
       "      <td>0.469306</td>\n",
       "      <td>...</td>\n",
       "      <td>0.539532</td>\n",
       "      <td>5.726618e-01</td>\n",
       "      <td>6.062965e-01</td>\n",
       "      <td>0.625994</td>\n",
       "      <td>0.629641</td>\n",
       "      <td>0.657209</td>\n",
       "      <td>0.646317</td>\n",
       "      <td>0.649761</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.681501</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 73 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text      genre  \\\n",
       "0  - .~\\r\\nhi -\\r\\nfâ€˜. w\\r\\nHUMMINGBIRDS. 215\\r...  19CNONFIC   \n",
       "1  ,22 For ndvertisinir J'Tid other expermos inci...  19CNONFIC   \n",
       "2  ;\\r\\nFor the particulars of the warfare carrie...  19CNONFIC   \n",
       "3  .\\\\r\\nThe lower hackle feathers and those of t...  19CNONFIC   \n",
       "4  \" There's not much trying needed on\\r\\none sid...  19CNONFIC   \n",
       "\n",
       "                                         filename  avg_overall  \\\n",
       "0             19CNONFIC_5S_chi-086857689-norm.txt     2.666667   \n",
       "1  19CNONFIC_5S_dul1-ark-13960-t2q53cp26-norm.txt     1.000000   \n",
       "2        19CNONFIC_5S_hvd-32044009984873-norm.txt     2.111111   \n",
       "3        19CNONFIC_5S_coo-31924003689696-norm.txt     2.555556   \n",
       "4  19CNONFIC_5S_dul1-ark-13960-t7gq7nw7v-norm.txt     4.000000   \n",
       "\n",
       "   Within401____rf____pos1  Within401____rf____pos2  Within401____rf____pos3  \\\n",
       "0                 0.448108                    0.424                 0.381860   \n",
       "1                 0.328137                    0.250                 0.157586   \n",
       "2                 0.211849                    0.376                 0.370150   \n",
       "3                 0.317469                    0.388                 0.281614   \n",
       "4                 0.607992                    0.578                 0.557512   \n",
       "\n",
       "   Within401____rf____pos23  Within401____rf____word1  \\\n",
       "0                  0.420023                  0.327730   \n",
       "1                  0.163924                  0.280059   \n",
       "2                  0.363000                  0.344209   \n",
       "3                  0.319320                  0.377767   \n",
       "4                  0.606257                  0.548024   \n",
       "\n",
       "   Within401____rf____word2  ...  Within401____svm____mood  \\\n",
       "0                  0.484688  ...                  0.372718   \n",
       "1                  0.486874  ...                  0.163641   \n",
       "2                  0.485308  ...                  0.126247   \n",
       "3                  0.500396  ...                  0.433406   \n",
       "4                  0.469306  ...                  0.539532   \n",
       "\n",
       "   Within401____svm____voice  Within401____svm____tense_mood_voice  \\\n",
       "0               3.896500e-01                          3.057980e-01   \n",
       "1               1.000000e-07                          1.000000e-07   \n",
       "2               5.587968e-01                          2.034130e-01   \n",
       "3               3.963675e-01                          3.618755e-01   \n",
       "4               5.726618e-01                          6.062965e-01   \n",
       "\n",
       "   Within401____svm____pos_tense  Within401____svm____pos_mood  \\\n",
       "0                       0.287683                      0.291664   \n",
       "1                       0.042644                      0.056466   \n",
       "2                       0.116507                      0.127819   \n",
       "3                       0.139004                      0.146435   \n",
       "4                       0.625994                      0.629641   \n",
       "\n",
       "   Within401____svm____pos_voice  Within401____svm____pos_tense_mood_voice  \\\n",
       "0                       0.289072                                  0.293827   \n",
       "1                       0.015624                                  0.023885   \n",
       "2                       0.135610                                  0.141850   \n",
       "3                       0.164269                                  0.167478   \n",
       "4                       0.657209                                  0.646317   \n",
       "\n",
       "   Within401____svm____pos_tense_mood_voice_quoted  \\\n",
       "0                                         0.290459   \n",
       "1                                         0.021508   \n",
       "2                                         0.136054   \n",
       "3                                         0.161899   \n",
       "4                                         0.649761   \n",
       "\n",
       "   Within401____svm____pos_dep_tense_mood_voice  \\\n",
       "0                                      0.258537   \n",
       "1                                      0.011072   \n",
       "2                                      0.388536   \n",
       "3                                      0.191254   \n",
       "4                                      0.500000   \n",
       "\n",
       "   Within401____svm____all_categories  \n",
       "0                            0.102668  \n",
       "1                            0.000103  \n",
       "2                            0.343973  \n",
       "3                            0.161185  \n",
       "4                            0.681501  \n",
       "\n",
       "[5 rows x 73 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(annotated_df.shape)\n",
    "annotated_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
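    "# Export is commented out; uncomment the next line to write the cross-validation predictions table to TSV.\n",
    "# annotated_df.to_csv('../../new-results/CrossValPredictions_AllFeatureSpaces_3Algos.tsv', index=None, sep='\\t')"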
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pos-mood: random forest on POS + mood features (5-fold cross-validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train -- Other: (401, 4) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 42) & test: (0,)\n",
      "401 401 (401, 42)\n",
      "401 401\n"
     ]
    }
   ],
   "source": [
    "# POS + mood feature matrix for the annotated files; the empty list is the test split\n",
    "# (unused here), so the second return value is discarded.\n",
    "X, _ = vectorizer.pos_mood(fnames, [])\n",
    "algo = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)\n",
    "print(len(fnames), len(Y), X.shape)\n",
    "\n",
    "# Run the 5-fold cross-validation once and derive the hard labels from the out-of-fold\n",
    "# probabilities, instead of refitting every fold a second time with method='predict'.\n",
    "# A random forest predicts the class with the highest mean probability, and the\n",
    "# probability columns follow the sorted class labels, so the argmax reproduces predict().\n",
    "predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "class_labels = pd.Series(sorted(set(Y))).values  # array of labels in column order\n",
    "predictions = class_labels[predictions_prob.argmax(axis=1)]\n",
    "print(len(predictions), len(predictions_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1: 0.7527 | Precision and Recall: 0.7821 0.7254\n"
     ]
    }
   ],
   "source": [
    "# Score the cross-validated labels against Y with 'POS' as the positive class;\n",
    "# every metric is rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(\"F1:\", f1, \"| Precision and Recall:\", prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "      <th>Within401_AllCategories_Prob_Narrative</th>\n",
       "      <th>Within401_PosMood_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "      <td>0.100</td>\n",
       "      <td>0.171560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "      <td>0.182</td>\n",
       "      <td>0.248461</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \\\n",
       "0                                0.012   3                           0.232556   \n",
       "1                                0.004   4                           0.236963   \n",
       "\n",
       "   Within401_AllCategories_Prob_Narrative  Within401_PosMood_Prob_Narrative  \n",
       "0                                   0.100                          0.171560  \n",
       "1                                   0.182                          0.248461  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Record each file's cross-validated probability from column 1 of predict_proba and\n",
    "# check that it agrees with the hard prediction at the 0.5 threshold.\n",
    "map_fname_prob = {}\n",
    "for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "    prob = probabilities[1]\n",
    "    expected_label = 'POS' if prob > 0.5 else 'NEG'\n",
    "    assert pred == expected_label\n",
    "    map_fname_prob[fname] = prob\n",
    "\n",
    "# Attach the probabilities by filename; rows whose FILENAME is not in the mapping get NaN.\n",
    "annotated_df['Within401_PosMood_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_prob)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pos-unigrams: random forest on POS unigram features (5-fold cross-validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401 401 (401, 38)\n",
      "401 401\n"
     ]
    }
   ],
   "source": [
    "# POS unigram feature matrix for the annotated files; the empty list is the test split\n",
    "# (unused here), so the second return value is discarded.\n",
    "X, _ = vectorizer.pos_unigrams(fnames, [])\n",
    "algo = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)\n",
    "print(len(fnames), len(Y), X.shape)\n",
    "\n",
    "# Run the 5-fold cross-validation once and derive the hard labels from the out-of-fold\n",
    "# probabilities, instead of refitting every fold a second time with method='predict'.\n",
    "# A random forest predicts the class with the highest mean probability, and the\n",
    "# probability columns follow the sorted class labels, so the argmax reproduces predict().\n",
    "predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "class_labels = pd.Series(sorted(set(Y))).values  # array of labels in column order\n",
    "predictions = class_labels[predictions_prob.argmax(axis=1)]\n",
    "print(len(predictions), len(predictions_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1: 0.7317 | Precision and Recall: 0.767 0.6995\n"
     ]
    }
   ],
   "source": [
    "# Score the cross-validated labels against Y with 'POS' as the positive class;\n",
    "# every metric is rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(\"F1:\", f1, \"| Precision and Recall:\", prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "      <th>Within401_AllCategories_Prob_Narrative</th>\n",
       "      <th>Within401_PosMood_Prob_Narrative</th>\n",
       "      <th>Within401_PosUnigrams_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "      <td>0.100</td>\n",
       "      <td>0.171560</td>\n",
       "      <td>0.224250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "      <td>0.182</td>\n",
       "      <td>0.248461</td>\n",
       "      <td>0.231297</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \\\n",
       "0                                0.012   3                           0.232556   \n",
       "1                                0.004   4                           0.236963   \n",
       "\n",
       "   Within401_AllCategories_Prob_Narrative  Within401_PosMood_Prob_Narrative  \\\n",
       "0                                   0.100                          0.171560   \n",
       "1                                   0.182                          0.248461   \n",
       "\n",
       "   Within401_PosUnigrams_Prob_Narrative  \n",
       "0                              0.224250  \n",
       "1                              0.231297  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Record each file's cross-validated probability from column 1 of predict_proba and\n",
    "# check that it agrees with the hard prediction at the 0.5 threshold.\n",
    "map_fname_prob = {}\n",
    "for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "    prob = probabilities[1]\n",
    "    expected_label = 'POS' if prob > 0.5 else 'NEG'\n",
    "    assert pred == expected_label\n",
    "    map_fname_prob[fname] = prob\n",
    "\n",
    "# Attach the probabilities by filename; rows whose FILENAME is not in the mapping get NaN.\n",
    "annotated_df['Within401_PosUnigrams_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_prob)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## POS-TMV-PctQuoted: random forest on POS + TMV + percent-quoted features (5-fold cross-validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train -- Other: (401, 10) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 48) & test: (0,)\n",
      "401 401 (401, 48)\n",
      "401 401\n"
     ]
    }
   ],
   "source": [
    "# POS + TMV + quoted feature matrix for the annotated files; the empty list is the test\n",
    "# split (unused here), so the second return value is discarded.\n",
    "X, _ = vectorizer.pos_tmv_quoted(fnames, [])\n",
    "algo = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)\n",
    "print(len(fnames), len(Y), X.shape)\n",
    "\n",
    "# Run the 5-fold cross-validation once and derive the hard labels from the out-of-fold\n",
    "# probabilities, instead of refitting every fold a second time with method='predict'.\n",
    "# A random forest predicts the class with the highest mean probability, and the\n",
    "# probability columns follow the sorted class labels, so the argmax reproduces predict().\n",
    "predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "class_labels = pd.Series(sorted(set(Y))).values  # array of labels in column order\n",
    "predictions = class_labels[predictions_prob.argmax(axis=1)]\n",
    "print(len(predictions), len(predictions_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1: 0.7394 | Precision and Recall: 0.7596 0.7202\n"
     ]
    }
   ],
   "source": [
    "# Score the cross-validated labels against Y with 'POS' as the positive class;\n",
    "# every metric is rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(\"F1:\", f1, \"| Precision and Recall:\", prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "      <th>Within401_AllCategories_Prob_Narrative</th>\n",
       "      <th>Within401_PosMood_Prob_Narrative</th>\n",
       "      <th>Within401_PosUnigrams_Prob_Narrative</th>\n",
       "      <th>Within401_PosTMVQuoted_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "      <td>0.100</td>\n",
       "      <td>0.171560</td>\n",
       "      <td>0.224250</td>\n",
       "      <td>0.150701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "      <td>0.182</td>\n",
       "      <td>0.248461</td>\n",
       "      <td>0.231297</td>\n",
       "      <td>0.234420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \\\n",
       "0                                0.012   3                           0.232556   \n",
       "1                                0.004   4                           0.236963   \n",
       "\n",
       "   Within401_AllCategories_Prob_Narrative  Within401_PosMood_Prob_Narrative  \\\n",
       "0                                   0.100                          0.171560   \n",
       "1                                   0.182                          0.248461   \n",
       "\n",
       "   Within401_PosUnigrams_Prob_Narrative  Within401_PosTMVQuoted_Prob_Narrative  \n",
       "0                              0.224250                               0.150701  \n",
       "1                              0.231297                               0.234420  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Record each file's cross-validated probability from column 1 of predict_proba and\n",
    "# check that it agrees with the hard prediction at the 0.5 threshold.\n",
    "map_fname_prob = {}\n",
    "for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "    prob = probabilities[1]\n",
    "    expected_label = 'POS' if prob > 0.5 else 'NEG'\n",
    "    assert pred == expected_label\n",
    "    map_fname_prob[fname] = prob\n",
    "\n",
    "# Attach the probabilities by filename; rows whose FILENAME is not in the mapping get NaN.\n",
    "annotated_df['Within401_PosTMVQuoted_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_prob)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pos-tense: random forest on POS + tense features (5-fold cross-validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train -- Other: (401, 1) | pos: (401, 38)\n",
      "Test -- Other: (0,) | pos: (0, 38)\n",
      "Combined shape - train: (401, 39) & test: (0,)\n",
      "401 401 (401, 39)\n"
     ]
    }
   ],
   "source": [
    "# POS + tense feature matrix for the annotated files; the empty list is the test split\n",
    "# (unused here), so the second return value is discarded.\n",
    "X, _ = vectorizer.pos_tense(fnames, [])\n",
    "\n",
    "# Random forest configuration evaluated on this feature space.\n",
    "algo = RandomForestClassifier(\n",
    "    n_estimators=500,\n",
    "    max_depth=5,\n",
    "    random_state=42,\n",
    ")\n",
    "print(len(fnames), len(Y), X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401 401\n"
     ]
    }
   ],
   "source": [
    "# Run the 5-fold cross-validation once and derive the hard labels from the out-of-fold\n",
    "# probabilities, instead of refitting every fold a second time with method='predict'.\n",
    "# A random forest predicts the class with the highest mean probability, and the\n",
    "# probability columns follow the sorted class labels, so the argmax reproduces predict().\n",
    "predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "class_labels = pd.Series(sorted(set(Y))).values  # array of labels in column order\n",
    "predictions = class_labels[predictions_prob.argmax(axis=1)]\n",
    "print(len(predictions), len(predictions_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1: 0.7493 | Precision and Recall: 0.7809 0.7202\n"
     ]
    }
   ],
   "source": [
    "# Score the cross-validated labels against Y with 'POS' as the positive class;\n",
    "# every metric is rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(\"F1:\", f1, \"| Precision and Recall:\", prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \n",
       "0                                0.012   3                           0.232556  \n",
       "1                                0.004   4                           0.236963  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Record each file's cross-validated probability from column 1 of predict_proba and\n",
    "# check that it agrees with the hard prediction at the 0.5 threshold.\n",
    "map_fname_prob = {}\n",
    "for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "    prob = probabilities[1]\n",
    "    expected_label = 'POS' if prob > 0.5 else 'NEG'\n",
    "    assert pred == expected_label\n",
    "    map_fname_prob[fname] = prob\n",
    "\n",
    "# Attach the probabilities by filename; rows whose FILENAME is not in the mapping get NaN.\n",
    "annotated_df['Within401_PosTense_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_prob)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model 2 -- all-categories: random forest on all feature categories (5-fold cross-validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train -- tmv-quoted: (401, 10) | dep: (401, 45) | pos: (401, 38) | word: (401, 100)\n",
      "Test -- tmv-quoted: (0,) | dep: (0, 45) | pos: (0, 38) | word: (0, 100)\n",
      "Combined shape - train: (401, 193) & test: (0,)\n",
      "401 401 (401, 193)\n"
     ]
    }
   ],
   "source": [
    "# Feature matrix combining all feature categories for the annotated files; the empty\n",
    "# list is the test split (unused here), so the second return value is discarded.\n",
    "X, _ = vectorizer.all_feature_categories_uni(fnames, [])\n",
    "\n",
    "# NOTE(review): max_depth is 20 here but 5 in the other sections -- presumably chosen\n",
    "# for this larger feature space; confirm.\n",
    "algo = RandomForestClassifier(\n",
    "    n_estimators=500,\n",
    "    max_depth=20,\n",
    "    random_state=42,\n",
    ")\n",
    "print(len(fnames), len(Y), X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401 401\n"
     ]
    }
   ],
   "source": [
    "# Run the 5-fold cross-validation once and derive the hard labels from the out-of-fold\n",
    "# probabilities, instead of refitting every fold a second time with method='predict'.\n",
    "# A random forest predicts the class with the highest mean probability, and the\n",
    "# probability columns follow the sorted class labels, so the argmax reproduces predict().\n",
    "predictions_prob = cross_val_predict(algo, X, Y, cv=5, method='predict_proba')\n",
    "class_labels = pd.Series(sorted(set(Y))).values  # array of labels in column order\n",
    "predictions = class_labels[predictions_prob.argmax(axis=1)]\n",
    "print(len(predictions), len(predictions_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1: 0.7306 | Precision and Recall: 0.7306 0.7306\n"
     ]
    }
   ],
   "source": [
    "# Score the cross-validated labels against Y with 'POS' as the positive class;\n",
    "# every metric is rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(\"F1:\", f1, \"| Precision and Recall:\", prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "      <th>ID</th>\n",
       "      <th>Within401_PosTense_Prob_Narrative</th>\n",
       "      <th>Within401_AllCategories_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\r\\n“Dinnseanchus, ” ...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.012</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232556</td>\n",
       "      <td>0.100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.004</td>\n",
       "      <td>4</td>\n",
       "      <td>0.236963</td>\n",
       "      <td>0.182</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       GENRE                             FILENAME  \\\n",
       "0  19CNONFIC  19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "1  19CNONFIC  19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "\n",
       "                                                TEXT  Avg_Reader_Score  \\\n",
       "0  It is referred to in the \\r\\n“Dinnseanchus, ” ...          2.000000   \n",
       "1  It is said to be imbricated. Compare the peria...          1.333333   \n",
       "\n",
       "   Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                    0.10   \n",
       "1                                    0.19   \n",
       "\n",
       "   Trained12k_5features_Prob_Narrative  ID  Within401_PosTense_Prob_Narrative  \\\n",
       "0                                0.012   3                           0.232556   \n",
       "1                                0.004   4                           0.236963   \n",
       "\n",
       "   Within401_AllCategories_Prob_Narrative  \n",
       "0                                   0.100  \n",
       "1                                   0.182  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map each file to its predicted probability of being narrative, checking every hard label against it.\n",
    "# NOTE(review): index 1 is assumed to be the 'POS' column of predictions_prob -- confirm against the class ordering used upstream.\n",
    "map_fname_prob = {}\n",
    "for fname, pred, probabilities in zip(fnames, predictions, predictions_prob):\n",
    "    prob = probabilities[1]\n",
    "\n",
    "    # The label implied by the probability must match the predicted label (anything not above 0.5 counts as 'NEG').\n",
    "    expected_label = 'POS' if prob > 0.5 else 'NEG'\n",
    "    assert pred == expected_label\n",
    "\n",
    "    map_fname_prob[fname] = prob\n",
    "\n",
    "annotated_df['Within401_AllCategories_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_prob)\n",
    "annotated_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "##annotated_df.to_csv('../../results/Predictions_Top5Models.tsv', index=None, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
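    "## Model 2: random forest on the top-5 POS/TMV features, trained on the large corpus and evaluated on the reader-annotated passages"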
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# genres initially: 19\n",
      "Post-filtering, # genres: 18\n",
      "(416, 30) (105, 5) (450, 3) | Total reader-annotated files: 971\n",
      "Total Mispredictions: 1090 | From: POS-TMV_13438_predictions.tsv\n",
      "Dataset size: (12348, 5)\n"
     ]
    }
   ],
   "source": [
    "# Training data for Model 2; the loader is asked to leave out the reader-annotated passages and earlier mispredictions.\n",
    "train_fnames, train_Y = data_loader.load_data(remove_annotated_passages=True, remove_mispreds=True)\n",
    "\n",
    "# Evaluation set: the files listed in annotated_df (built in an earlier cell).\n",
    "annotated_fnames = annotated_df['FILENAME'].tolist()\n",
    "\n",
    "# Lookup indexed by filename further down to get each passage's reader score.\n",
    "map_annotated_reader = data_loader.reader_annotated_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model 2 classifier: a 500-tree random forest with unlimited depth; random_state is pinned so refits are repeatable.\n",
    "algo = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42) # the best pos-TMV parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total Features: 5 | TMV: ['concreteness', 'agenthood'] | POS: ['nn', 'vbz', 'vbd']\n",
      "POS Train: (12348, 3) | POS Test: (416, 3) | POS feature-columns: ['nn', 'vbd', 'vbz']\n",
      "Train files: 12348 | Annotated files: 416 (12348, 5) 416 | (416, 5)\n"
     ]
    }
   ],
   "source": [
    "# Build the top-5 feature matrices for the training files and for the annotated (evaluation) files.\n",
    "X_train, X_test = best_model.top_n_model(train_fnames, annotated_fnames, N=5)\n",
    "\n",
    "# Each matrix must have one row per label / per file, otherwise fitting and the filename mapping below would be misaligned.\n",
    "assert X_train.shape[0] == len(train_Y)\n",
    "assert X_test.shape[0] == len(annotated_fnames)\n",
    "\n",
    "# Report the training-label count next to X_train (previously printed len(Y), a variable left over from the earlier model section).\n",
    "print(\"Train files:\", len(train_fnames), \"| Annotated files:\", len(annotated_fnames), X_train.shape, len(train_Y), \"|\", X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ordering: ['NEG', 'POS'] | Predictions: 416 416\n"
     ]
    }
   ],
   "source": [
    "# Fit on the training matrix, then score the annotated passages with both class probabilities and hard labels.\n",
    "algo.fit(X_train, train_Y)\n",
    "pred_probs = algo.predict_proba(X_test)\n",
    "predictions = algo.predict(X_test)\n",
    "# classes_ gives the column order of predict_proba; it is printed so the position of 'POS' can be checked.\n",
    "print(\"Ordering:\", algo.classes_.tolist(), \"| Predictions:\", len(pred_probs), len(predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column of predict_proba that holds P('POS'), looked up from the fitted model instead of hard-coding index 1.\n",
    "pos_col = algo.classes_.tolist().index('POS')\n",
    "\n",
    "# zip() stops at the shortest input, so make sure no file would be silently dropped.\n",
    "assert len(annotated_fnames) == len(pred_probs) == len(predictions)\n",
    "\n",
    "map_fname_probnarr = {}\n",
    "for fname, probs, pred in zip(annotated_fnames, pred_probs, predictions):\n",
    "    prob_narr = probs[pos_col]\n",
    "    # The hard label must agree with the probability; anything not above 0.5 is expected to be 'NEG'.\n",
    "    if prob_narr > 0.5: assert pred == 'POS'\n",
    "    else: assert pred == 'NEG'\n",
    "\n",
    "    map_fname_probnarr[fname] = prob_narr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gold labels for the annotated passages: a reader score of 2.5 or more counts as 'POS', anything lower as 'NEG'.\n",
    "Y_test = [\n",
    "    'POS' if map_annotated_reader[fname] >= 2.5 else 'NEG'\n",
    "    for fname in annotated_fnames\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7692 0.6992 0.8549\n"
     ]
    }
   ],
   "source": [
    "# F1, precision and recall for the 'POS' class on the annotated passages, each rounded to 4 decimals.\n",
    "f1, prec, rec = (\n",
    "    round(metric(Y_test, predictions, pos_label='POS'), 4)\n",
    "    for metric in (f1_score, precision_score, recall_score)\n",
    ")\n",
    "print(f1, prec, rec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach Model 2's P(narrative) to each row by filename; Series.map leaves NaN where a filename has no entry in the dict.\n",
    "annotated_df['Trained12k_5features_Prob_Narrative'] = annotated_df['FILENAME'].map(map_fname_probnarr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENRE</th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>TEXT</th>\n",
       "      <th>Avg_Reader_Score</th>\n",
       "      <th>Within416_AllCategories_Prob_Narrative</th>\n",
       "      <th>Trained12k_5features_Prob_Narrative</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086596372-norm.txt</td>\n",
       "      <td>- 2 \\n. W \\n- . \"*\"' - 32. The coeﬃcients ſoun...</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.116</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086756702-norm.txt</td>\n",
       "      <td>It is referred to in the \\n“Dinnseanchus, ” an...</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.100</td>\n",
       "      <td>0.012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086785002-norm.txt</td>\n",
       "      <td>It is said to be imbricated. Compare the peria...</td>\n",
       "      <td>1.333333</td>\n",
       "      <td>0.190</td>\n",
       "      <td>0.004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086857689-norm.txt</td>\n",
       "      <td>- .~ \\nhi - \\nf‘. w \\nHUMMINGBIRDS. 215 \\n\\n\\n...</td>\n",
       "      <td>2.666667</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>19CNONFIC</td>\n",
       "      <td>19CNONFIC_5S_chi-086950380-norm.txt</td>\n",
       "      <td>— - I was fast slipping \\naway from paralysis,...</td>\n",
       "      <td>4.444444</td>\n",
       "      <td>0.790</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>411</td>\n",
       "      <td>NOVEL19C</td>\n",
       "      <td>NOVEL19C_5S_1-EN_1903_Norris-Frank_ThePit_Nove...</td>\n",
       "      <td>The janitor or sexton, a severe old fellow, wh...</td>\n",
       "      <td>4.555556</td>\n",
       "      <td>0.524</td>\n",
       "      <td>0.898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>412</td>\n",
       "      <td>NOVEL19C</td>\n",
       "      <td>NOVEL19C_5S_1-EN_1904_Murfree-MaryNoailles_The...</td>\n",
       "      <td>His face, albeit no stranger to the use of the...</td>\n",
       "      <td>3.777778</td>\n",
       "      <td>0.686</td>\n",
       "      <td>0.404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>413</td>\n",
       "      <td>NOVEL19C</td>\n",
       "      <td>NOVEL19C_5S_1-EN_1905_Orczy-Emma_TheScarletPim...</td>\n",
       "      <td>\"That impudent Scarlet Pimpernel would slip th...</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>0.698</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>414</td>\n",
       "      <td>NOVEL19C</td>\n",
       "      <td>NOVEL19C_5S_1-EN_1906_London-Jack_WhiteFang_No...</td>\n",
       "      <td>It was a big bull they first found. Here was m...</td>\n",
       "      <td>3.666667</td>\n",
       "      <td>0.648</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>415</td>\n",
       "      <td>NOVEL19C</td>\n",
       "      <td>NOVEL19C_5S_1-EN_1906_Sinclair-Upton_TheJungle...</td>\n",
       "      <td>She had apparently forgotten all about him, an...</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>0.824</td>\n",
       "      <td>1.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>416 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         GENRE                                           FILENAME  \\\n",
       "0    19CNONFIC                19CNONFIC_5S_chi-086596372-norm.txt   \n",
       "1    19CNONFIC                19CNONFIC_5S_chi-086756702-norm.txt   \n",
       "2    19CNONFIC                19CNONFIC_5S_chi-086785002-norm.txt   \n",
       "3    19CNONFIC                19CNONFIC_5S_chi-086857689-norm.txt   \n",
       "4    19CNONFIC                19CNONFIC_5S_chi-086950380-norm.txt   \n",
       "..         ...                                                ...   \n",
       "411   NOVEL19C  NOVEL19C_5S_1-EN_1903_Norris-Frank_ThePit_Nove...   \n",
       "412   NOVEL19C  NOVEL19C_5S_1-EN_1904_Murfree-MaryNoailles_The...   \n",
       "413   NOVEL19C  NOVEL19C_5S_1-EN_1905_Orczy-Emma_TheScarletPim...   \n",
       "414   NOVEL19C  NOVEL19C_5S_1-EN_1906_London-Jack_WhiteFang_No...   \n",
       "415   NOVEL19C  NOVEL19C_5S_1-EN_1906_Sinclair-Upton_TheJungle...   \n",
       "\n",
       "                                                  TEXT  Avg_Reader_Score  \\\n",
       "0    - 2 \\n. W \\n- . \"*\"' - 32. The coeﬃcients ſoun...          0.777778   \n",
       "1    It is referred to in the \\n“Dinnseanchus, ” an...          2.000000   \n",
       "2    It is said to be imbricated. Compare the peria...          1.333333   \n",
       "3    - .~ \\nhi - \\nf‘. w \\nHUMMINGBIRDS. 215 \\n\\n\\n...          2.666667   \n",
       "4    — - I was fast slipping \\naway from paralysis,...          4.444444   \n",
       "..                                                 ...               ...   \n",
       "411  The janitor or sexton, a severe old fellow, wh...          4.555556   \n",
       "412  His face, albeit no stranger to the use of the...          3.777778   \n",
       "413  \"That impudent Scarlet Pimpernel would slip th...          3.000000   \n",
       "414  It was a big bull they first found. Here was m...          3.666667   \n",
       "415  She had apparently forgotten all about him, an...          4.000000   \n",
       "\n",
       "     Within416_AllCategories_Prob_Narrative  \\\n",
       "0                                     0.116   \n",
       "1                                     0.100   \n",
       "2                                     0.190   \n",
       "3                                     0.250   \n",
       "4                                     0.790   \n",
       "..                                      ...   \n",
       "411                                   0.524   \n",
       "412                                   0.686   \n",
       "413                                   0.698   \n",
       "414                                   0.648   \n",
       "415                                   0.824   \n",
       "\n",
       "     Trained12k_5features_Prob_Narrative  \n",
       "0                                  0.000  \n",
       "1                                  0.012  \n",
       "2                                  0.004  \n",
       "3                                  0.014  \n",
       "4                                  1.000  \n",
       "..                                   ...  \n",
       "411                                0.898  \n",
       "412                                0.404  \n",
       "413                                1.000  \n",
       "414                                1.000  \n",
       "415                                1.000  \n",
       "\n",
       "[416 rows x 6 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the table to eyeball the probability columns next to the reader scores.\n",
    "annotated_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the table as a tab-separated file without the row index (index=False is the documented boolean form).\n",
    "# NOTE(review): the output name is spelled 'Predicitons'; left unchanged in case other scripts read this exact path.\n",
    "annotated_df.to_csv('../../Predicitons_Top2Models.tsv', index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fin."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
